#include "config.h"

#include <time.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/types.h>
#include <dirent.h>
#include <numaif.h>

#define DBG_SUBSYS S_LIBYLIB

#include "configure.h"
#include "mem_hugepage.h"
#include "ylock.h"
#include "sysy_lib.h"
#include "variable.h"
#include "dbg.h"
#include "net_global.h"
#include "../../schedule/cpuset.h"
#include "core.h"
#include "spdk.h"
#include "buddy.h"
#include "cpuset.h"

#define STATIC_STOR_AREA_OFFSET  (4096)
#define HUGEPAGE_OFFSET          (16UL)

#define HUGEPAGE_DIR  "/dev/hugepages"
//#define HUGEPAGE_PROC_FILE  "/proc/sys/vm/nr_hugepages" /*this is big trap..*/
#define HUGEPAGE_PROC_FILE  "/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages"


/**
 *  mem struct:
 *
 *
 *     |<------------------------ len:  (gloconf.memcache_count + 16) * MEMPAGE_SIZE ----------------------------->|
 *
 *       static_stor_area
 *     |_______|___struct mem_area_desc_t___|______struct buddy_____|_____________________________________________|
 *     |                                    |                       |
 *start_addr                           buddy_addr              buddy_mem_addr
 *
 *
 *
 * Note:
 *      mem->static_stor_area = mem->start_addr + STATIC_STOR_AREA_OFFSET;
 *      mem->buddy_addr = mem->static_stor_area
 *                         + sizeof(struct mem_area_desc_t)
 *                         + sizeof(struct mempage_t) * gloconf.memcache_count;
 *
 *      mem->buddy_mem_addr = addr + ((gloconf.memcache_count + 16)
 *                            - gloconf.memcache_count) * MEMPAGE_SIZE;
 *
 *
 * There are currently two call paths:
 * 1. Non-polling core:  global_mem_init
 * 2. Polling core:      get_global_private_mem
 **/


/*
 * Descriptor for one MEMPAGE_SIZE (2MB) page inside a core's private
 * area.  An array of these lives in the static storage area right after
 * struct mem_area_desc_t (see the layout diagram above).
 */
struct mempage_t {
        struct list_head list;    /* linked into buffer_io_free_list / buffer_io_alloc_list */
        uint64_t phyaddr;         /* physical address; filled via __virt2phy() when hugepages are enabled */
        void     *virtaddr;       /* virtual address of the page */
        int     idx;              /* index into mem_area_desc_t.mempages[] */
        int     ref;              /* reference count of sub-allocations on this page */
        int     offset;           /* bump-allocation cursor within the page */
        sy_spinlock_t lock;
};

/*
 * Per-core memory area descriptor.  Placed at start_addr +
 * STATIC_STOR_AREA_OFFSET inside the core's private region (see the
 * layout diagram above).  All lists and counters are protected by `lock`.
 */
struct mem_area_desc_t {
        struct list_head buffer_io_free_list;   /* pages that still have room for buffer-io sub-allocations */
        struct list_head buffer_io_alloc_list;  /* pages fully handed out to buffer-io */

        void             *start_addr;           /* base of the whole private region */

        void             *static_stor_area;     /* start_addr + STATIC_STOR_AREA_OFFSET */
        size_t           static_current_offset; /* bump cursor inside static_stor_area */
        size_t           static_stor_area_len;  /* total bytes registered so far */

        void             *buddy_mem_addr;       /* first page managed by the buddy allocator */
        void             *buddy_addr;           /* the buddy tree itself */

        sy_spinlock_t    lock;

        int core_hash;                          /* owning core; used to route remote frees */
        int buddy_alloc_count;                  /* times buddy_alloc() came up empty in mem_hugepage_new() */
        int buffer_io_alloc_count;              /* pages currently on buffer_io_alloc_list */
        int non_io_allco_count;                 /* direct mempages_alloc() allocations */
        size_t free_mempage_count;              /* pages currently on buffer_io_free_list */
        struct mempage_t mempages[0];           /* zero-length trailing array (GNU); one entry per managed page */
};

/*
 * Per-NUMA-node allocation cursor: __addr__ points at the next free
 * slice of the node's private memory and is advanced by each polling
 * core in core_private_mem_init().
 */
struct private_cpunode_mem_alloc_info_t {
        void *__addr__;        /* next unassigned address in this node's slice */
        int  socketid;         /* NUMA node id; equals the array index */
        sy_spinlock_t lock;    /* serializes concurrent core initialisation */
};

/* Global (non-polling-core) memory descriptor; fallback for mem_self() */
static struct mem_area_desc_t *__mem_info__;
/* 2MB-aligned base of the per-polling-core private memory region */
static void *__private_mempages_addr__ = NULL;

/* Per-NUMA-node allocation cursors, filled in global_private_mem_init() */
struct private_cpunode_mem_alloc_info_t *__private_cpunode_mem_alloc_info__ = NULL;
/* NUMA node id of each hugepage file; set to -1 once the page is claimed
 * (see __lookup_available_hugepage); presumably filled by the (truncated)
 * __create_hugepage_file__ below — confirm against the rest of the file */
int *mem_node_info = NULL;

static inline struct mem_area_desc_t *mem_self()
{
        struct mem_area_desc_t *mem;
        mem = variable_get(VARIABLE_HUGEPAGE);
        if (unlikely(!mem)) {
                mem = __mem_info__;
        }
        return mem;
}

/*static int __change_mem_priv(void *addr, size_t size, int flag)
{
        int ret;

        ret = mprotect(addr, size, flag);
        if(ret)
                GOTO(err_ret, ret);

        return 0;
err_ret:
        return ret;
} */

/**
 * Look up the NUMA node (socket) a CPU belongs to by scanning
 * /sys/devices/system/cpu/cpuN for a "node<id>" directory entry.
 *
 * @param cpu_id     CPU to query
 * @param socket_id  out: NUMA node id; 0 when no node entry is found
 *                   (compatibility mode for non-NUMA systems)
 * @return 0 on success, an errno-style code on failure
 */
static int __get_socket_id(int cpu_id, int *socket_id)
{
        char path[128];
        int ret;
        DIR *dir;
        struct dirent debuf, *de;

        snprintf(path, sizeof(path), "%s%d", "/sys/devices/system/cpu/cpu", cpu_id);
        DINFO("the path is %s\n", path);
        dir = opendir(path);
        if (dir == NULL) {
                ret = errno;
                GOTO(err_ret, ret);
        }

        *socket_id = -1;

        while (1) {
                /* readdir_r() returns a positive error number and does NOT
                 * set errno; the old `ret < 0` test could never fire */
                ret = readdir_r(dir, &debuf, &de);
                if (ret != 0) {
                        GOTO(err_close, ret);
                }

                if (de == NULL) {
                        break;
                }

                /* entries look like "node0"; match the prefix and require a
                 * digit so multi-digit node ids ("node12") also work — the
                 * old strlen()==5 check only accepted single-digit nodes */
                if (strncmp(de->d_name, "node", 4) == 0 &&
                    de->d_name[4] >= '0' && de->d_name[4] <= '9') {
                        *socket_id = atoi(de->d_name + 4);
                        break;
                }
        }

        closedir(dir);

        if (*socket_id == -1) {
                //DWARN("get numa information failed, switch to compatibility mode.\r\n");
                *socket_id = 0;
        }

        return 0;
err_close:
        closedir(dir);
err_ret:
        return ret;
}

/*
 * Translate a virtual address to its physical address via
 * /proc/self/pagemap.
 *
 * NOTE(review): exits the process on any failure (original behavior kept);
 * on recent kernels reading PFNs requires CAP_SYS_ADMIN — without it the
 * kernel reports a zero frame number and the result is bogus. Confirm the
 * runtime privileges match this assumption.
 */
static uint64_t __virt2phy(const void *virtaddr)
{
        int fd;
        uint64_t entry, physaddr;
        unsigned long virt_pfn;
        int page_size;
        off_t offset;

        /* pagemap is indexed by the standard 4K page, not MEMPAGE_SIZE */
        page_size = 4096;

        fd = open("/proc/self/pagemap", O_RDONLY);
        if (fd < 0) {
                exit(0);
        }

        /* one 64-bit entry per virtual page frame */
        virt_pfn = (unsigned long)virtaddr / page_size;
        offset = sizeof(uint64_t) * virt_pfn;
        if (lseek(fd, offset, SEEK_SET) == (off_t) -1) {
                close(fd);
                exit(0);
        }

        /* the old `< 0` check missed short reads, which would leave `entry`
         * partially uninitialized and yield a garbage physical address */
        if (read(fd, &entry, sizeof(uint64_t)) != sizeof(uint64_t)) {
                close(fd);
                exit(0);
        }

        /* bits 0-54 of a pagemap entry hold the page frame number */
        physaddr = ((entry & 0x7fffffffffffffULL) * page_size)
                + ((unsigned long)virtaddr % page_size);

        close(fd);

        return physaddr;
}

/*
 * Determine which NUMA node a hugepage mapping at `virtaddr` landed on by
 * parsing /proc/self/numa_maps.
 *
 * Returns 0 on success (including "file unreadable", treated as node 0 by
 * the caller), -1 on a parse error.
 *
 * NOTE(review): when no numa_maps line matches `virtaddr`, *socketid is
 * left UNMODIFIED and 0 is still returned — callers must initialize it;
 * verify every call site does.
 */
static int  __find_numasocket(void *virtaddr, int *socketid)
{
        int socket_id;
        char *end, *nodestr;
        uint64_t virt_addr;
        char buf[BUFSIZ];
        FILE *f;

        f = fopen("/proc/self/numa_maps", "r");
        if (f == NULL) {
                DWARN("cannot open /proc/self/numa_maps,"
                                " consider that all memory is in socket_id 0\n");
                return 0;
        }


        while (fgets(buf, sizeof(buf), f) != NULL) {

                /* only hugepage mappings are of interest */
                if (strstr(buf, " huge ") == NULL &&
                                strstr(buf, "/dev/hugepages") == NULL)
                        continue;

                /* each line starts with the mapping's start address in hex */
                virt_addr = strtoull(buf, &end, 16);
                if (virt_addr == 0 || end == buf) {
                        DERROR("%s(): error in numa_maps parsing\n", __func__);
                        goto error;
                }

                /* " N<node>=<pages>" token carries the node id */
                nodestr = strstr(buf, " N");
                if (nodestr == NULL) {
                        DERROR("%s(): error in numa_maps parsing\n", __func__);
                        goto error;
                }
                nodestr += 2;
                end = strstr(nodestr, "=");
                if (end == NULL) {
                        DERROR("%s(): error in numa_maps parsing\n", __func__);
                        goto error;
                }
                /* terminate the node-id digits so strtoul sees only them */
                end[0] = '\0';
                end = NULL;

                socket_id = strtoul(nodestr, &end, 0);
                if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) {
                        DERROR("%s(): error in numa_maps parsing\n", __func__);
                        goto error;
                }

                /* if we find this page in our mappings, set socket_id */
                void *va = (void *)(unsigned long)virt_addr;
                if (virtaddr == va) {
                        *socketid = socket_id;
                        break;
                }
        }


        fclose(f);
        return 0;

error:
        fclose(f);
        return -1;
}

/*
 * Carve `size` bytes out of a core's static storage area and record the
 * resulting address in the per-core variable table at slot `type`.
 *
 * `private_addr` is the base of the core's private region, which doubles
 * as an array of per-type pointers (slot VARIABLE_HUGEPAGE holds the
 * mem_area_desc_t, set up by __core_private_mem_init).
 *
 * Returns the address assigned to `type`.  Never fails: overflow of the
 * static area is an assertion.
 */
void* register_private_static_stor_area(void *private_addr, size_t size, int type)
{
        void  **private_start_addr = (void **)private_addr;
        struct mem_area_desc_t *mem ;

        YASSERT(private_addr);

        mem = private_start_addr[VARIABLE_HUGEPAGE];
        /* bump-allocate from the static area; no alignment is applied here */
        private_start_addr[type] = mem->static_stor_area + mem->static_current_offset;

        mem->static_current_offset += size;
        mem->static_stor_area_len += size ;

        /* NOTE(review): the bounds below (8 pages minus 4096, and a 16MB
         * cap) look like independently chosen safety limits — confirm they
         * match the actual reserved HUGEPAGE_OFFSET region */
        YASSERT((uint64_t)mem->static_stor_area + mem->static_current_offset <  ((uint64_t)private_start_addr + 8 * MEMPAGE_SIZE - 4096));
        YASSERT(mem->static_stor_area_len <=  16 * 1024 * 1024);
        return private_start_addr[type];
}

/*
 * Split the global private pool evenly across NUMA nodes: allocate the
 * per-node cursor array and report each node's slice size.
 *
 * @param mem_size       out: bytes per NUMA node slice
 * @param hugepages_num  total private hugepages, must divide evenly by
 *                       the node count (asserted)
 * @return the per-node cursor array (also stored in the global
 *         __private_cpunode_mem_alloc_info__)
 */
static void *__cpunode_mem_alloc_init(size_t *mem_size, size_t hugepages_num)
{
        int cpunode_count;
        size_t pages_number_per_cpunode;
        size_t mem_size_per_cpu_node;
        struct  private_cpunode_mem_alloc_info_t *info = NULL;

        cpunode_count = get_cpunode_count();

        pages_number_per_cpunode = (gloconf.memcache_count + HUGEPAGE_OFFSET) * (cpuset_useable() / cpunode_count);

        YASSERT(pages_number_per_cpunode  == (hugepages_num / cpunode_count));
        mem_size_per_cpu_node = pages_number_per_cpunode * MEMPAGE_SIZE;

        /* calloc zero-fills and overflow-checks the multiplication; the old
         * malloc+memset pair dereferenced a NULL result on OOM */
        __private_cpunode_mem_alloc_info__ = calloc(cpunode_count, sizeof(struct private_cpunode_mem_alloc_info_t));
        YASSERT(__private_cpunode_mem_alloc_info__ != NULL);

        info = __private_cpunode_mem_alloc_info__;

        *mem_size = mem_size_per_cpu_node;

        return  info;
}

/*
 * Claim the first unclaimed hugepage that resides on the requested NUMA
 * node.  Claimed slots are marked -1 in mem_node_info.  Asserts when no
 * page on that node is left.
 */
static int __lookup_available_hugepage(int socketid)
{
        int idx;
        int total = gloconf.memcache_count +
                    (gloconf.memcache_count + 16) * cpuset_useable();

        for (idx = 0; idx < total; idx++) {
                if (mem_node_info[idx] != socketid)
                        continue;

                mem_node_info[idx] = -1;
                return idx;
        }

        YASSERT(0);

        return -1;
}

/*
 * Map `hugepages_num` hugepage files contiguously at `_addr` (MAP_FIXED),
 * drawing pages from NUMA node `_socketid`, or round-robin across nodes
 * when `_socketid` is -1.
 *
 * Each page is claimed via __lookup_available_hugepage(), mmap'ed over
 * the pre-reserved virtual range and zeroed.
 *
 * Returns 0 on success, an errno-style code on open/mmap failure.
 * NOTE(review): pages already mapped before a failure are not unmapped.
 */
static int __alloc_hugepages(void *_addr, int _socketid, int hugepages_num)
{
        int i,ret, fd, index, socketid;
        char path[64];
        int cpunode_count = get_cpunode_count();
        void  *addr, *tmp = _addr;
        int tmp_socket = _socketid;

        YASSERT(hugepages_num % cpunode_count == 0);

        for (i = 0; i < hugepages_num; i++) {

                /* -1 means "no preference": spread pages across nodes */
                if (_socketid == -1) {
                        tmp_socket = i % cpunode_count;
                }

                index = __lookup_available_hugepage(tmp_socket);

                snprintf(path, 64, "%s/lich_mempages_%d", HUGEPAGE_DIR, index);
                fd = open(path, O_RDWR);
                if (fd < 0) {
                        ret = errno;
                        DERROR("error %s on open %s\n", strerror(ret), path);
                        GOTO(err_ret, ret);
                }

                /* MAP_FIXED: the caller reserved this range; MAP_LOCKED pins it */
                addr = mmap(tmp, MEMPAGE_SIZE, PROT_WRITE | PROT_READ, MAP_SHARED|MAP_FIXED | MAP_LOCKED, fd, 0);
                if (addr == MAP_FAILED) {
                        ret =errno;
                        DERROR("error on mmap %s\n", strerror(ret));
                        GOTO(err_fd, ret);
                }

                // TODO performance?
                /* touch every byte so the page is faulted in up front */
                memset(addr, 0x00, MEMPAGE_SIZE);

                tmp += MEMPAGE_SIZE;

                (void) socketid;
                /* main loop hold wait too long */
                /*
                ret = __find_numasocket(addr, &socketid);
                if (ret) {
                        DERROR("error on find numasocket \n");
                        GOTO(err_fd, ret);
                }

                if (socketid != tmp_socket) {
                        DERROR("socketid error\n");
                        YASSERT(0);
                }
                */

                close(fd);
        }

        return 0;
err_fd:
        close(fd);
err_ret:
        return ret;
}

/*all polling core memory allocate*/
int global_private_mem_init()
{
        uint64_t total_private_mempages = cpuset_useable() * (gloconf.memcache_count + HUGEPAGE_OFFSET);
        int ret;
        void *addr = NULL, *tmp_addr ;

        /*because malloc func maybe return a address is not align with 2MB, need malloc mempages number
         *more than total_private_mempages one*/
        addr = malloc(MEMPAGE_SIZE * (total_private_mempages + 1));
        if (addr == NULL) {
                DERROR("malloc __private_mempages_addr__ failed, errno: %d\n", errno);
                ret = ENOMEM;
                GOTO(err_ret, ret);
        }

        /*adjust addr 2MB align*/
        tmp_addr = addr;
        if ((uint64_t)addr % MEMPAGE_SIZE) {
                tmp_addr = addr + MEMPAGE_SIZE - (uint64_t)addr % MEMPAGE_SIZE;
        }

        /*WARN: if you need free __private_mempages_addr__, this place need another global variable == addr*/
        __private_mempages_addr__ = tmp_addr;

        struct private_cpunode_mem_alloc_info_t *info;
        size_t mem_size_per_cpu_node;

        info = __cpunode_mem_alloc_init(&mem_size_per_cpu_node, total_private_mempages);

        if(!gloconf.huge_page) {
                ret = mlock(__private_mempages_addr__, MEMPAGE_SIZE * total_private_mempages);
                if (ret) {
                        DERROR("mlock mempages failed %s\n", strerror(ret));
                        GOTO(err_free, ret);
                }
        }

        size_t offset = 0;
        int cpunode_count = get_cpunode_count();
        for (int i = 0; i < cpunode_count; i++) {
                sy_spin_init(&info[i].lock);
                info[i].__addr__ = __private_mempages_addr__ + offset;
                offset += mem_size_per_cpu_node;
                info[i].socketid = i;
        }

        return 0;

err_free:
        //if(!gloconf.huge_page)
                free(addr);

err_ret:
        return ret;
}

/*
 * Report the base address and total byte size of the polling cores'
 * private memory region.
 */
void get_global_private_mem(void **addr, uint64_t *size)
{
        uint64_t pages = (uint64_t)cpuset_useable()
                         * (gloconf.memcache_count + HUGEPAGE_OFFSET);

        *addr = __private_mempages_addr__;
        *size = pages * MEMPAGE_SIZE;
}

/*
 * Report the base address and byte size of the global (non-polling-core)
 * buddy-managed memory.
 */
void get_global_public_mem(void **addr, uint64_t *size)
{
        uint64_t bytes = (uint64_t)gloconf.memcache_count * MEMPAGE_SIZE;

        *addr = __mem_info__->buddy_mem_addr;
        *size = bytes;
}

/*
 * Initialise the mempage descriptors of `mem`: one entry per 2MB page
 * starting at buddy_mem_addr, each with its index, a zeroed ref/offset
 * and (when hugepages are enabled) its physical address.
 *
 * Always returns 0.
 */
static int __init_mempages(struct mem_area_desc_t *mem, size_t mempage_num)
{
        size_t idx;
        void *page_va;
        struct mempage_t *page;

        YASSERT(mem);

        page_va = mem->buddy_mem_addr;
        YASSERT(page_va && (uint64_t)page_va % MEMPAGE_SIZE == 0);

        mem->free_mempage_count = 0;
        mem->buffer_io_alloc_count = 0;
        mem->buddy_alloc_count = 0;

        for (idx = 0; idx < mempage_num; idx++, page_va += MEMPAGE_SIZE) {
                page = &mem->mempages[idx];

                page->idx = idx;
                page->ref = 0;
                page->offset = 0;
                page->virtaddr = page_va;

                /* physical addresses only exist for pinned hugepages */
                if (gloconf.huge_page)
                        page->phyaddr = __virt2phy(page_va);

                sy_spin_init(&page->lock);
        }

        return 0;
}

/*
 * Fetch the pointer registered under variable slot `type` in a core's
 * private region (see register_private_static_stor_area).
 */
void *get_static_type_addr(void *private_mem, int type)
{
        void **slots = (void **)private_mem;

        YASSERT(private_mem);

        return slots[type];
}

/*
 * Lay out one core's private region at `addr` according to the diagram
 * at the top of this file: the per-type pointer table at the base, the
 * mem_area_desc_t (plus mempage array and buddy tree) in the static
 * storage area, and the buddy-managed pages after HUGEPAGE_OFFSET pages.
 *
 * Returns the initialised descriptor.
 */
static struct mem_area_desc_t *__core_private_mem_init(void *addr)
{
        void **__mem_addr__, *tmp;
        struct mem_area_desc_t *mem;
        int hp_num = gloconf.memcache_count;

        /*init mem area desc*/
        __mem_addr__ = (void **)addr;
        mem = (struct mem_area_desc_t *)(addr + STATIC_STOR_AREA_OFFSET);
        __mem_addr__[VARIABLE_HUGEPAGE] = mem;

        INIT_LIST_HEAD(&mem->buffer_io_free_list);
        INIT_LIST_HEAD(&mem->buffer_io_alloc_list);

        mem->buffer_io_alloc_count = 0;
        mem->buddy_alloc_count = 0;
        mem->free_mempage_count = 0;
        mem->non_io_allco_count = 0;

        mem->start_addr = addr;
        mem->static_stor_area = addr + STATIC_STOR_AREA_OFFSET;
        mem->static_current_offset = 0;
        mem->static_stor_area_len = 0;

        /* buddy tree goes right after the descriptor + mempage array */
        mem->buddy_addr = mem->static_stor_area
                          + sizeof(struct mem_area_desc_t)
                          + sizeof(struct mempage_t) * hp_num;

        /* first HUGEPAGE_OFFSET pages are reserved for metadata; the
         * buddy-managed pages start right after them */
        mem->buddy_mem_addr = addr + HUGEPAGE_OFFSET * MEMPAGE_SIZE;

        /* account the metadata (descriptor + mempages + buddy tree) in the
         * static area; the registered address must be the descriptor itself */
        tmp = register_private_static_stor_area((void *)mem->start_addr,
                                                sizeof(struct mem_area_desc_t)
                                                /*** mpages size ***/
                                                + (sizeof(struct mempage_t) * hp_num)
                                                /*** buddy tree size ***/
                                                + sizeof(struct buddy) + (2 * hp_num - 1) * sizeof(size_t),
                                                VARIABLE_HUGEPAGE);

        YASSERT(tmp == (void *)mem);
        sy_spin_init(&mem->lock);

        return mem;
}

/*
 * Fetch the allocation cursor for NUMA node `socket_id`.
 *
 * The old code computed &array[socket_id] BEFORE asserting the index was
 * valid, so a bad socket_id was used before being checked; validate first.
 */
static void __get_mem_info(int socket_id, struct private_cpunode_mem_alloc_info_t **_info)
{
        struct  private_cpunode_mem_alloc_info_t *info = NULL;

        YASSERT(socket_id >= 0);
        YASSERT(__private_cpunode_mem_alloc_info__ != NULL);

        info = &__private_cpunode_mem_alloc_info__[socket_id];
        YASSERT(info->socketid == socket_id);

        *_info = info;
}

void *core_private_mem_init(int cpu_id, int core_hash)
{
        int ret, socket_id = -1;
        struct private_cpunode_mem_alloc_info_t *info = NULL;
        struct mem_area_desc_t *mem;

        ret = __get_socket_id(cpu_id, &socket_id);
        if (ret) {
                YASSERT(0);
        }

        __get_mem_info(socket_id, &info);

        ret = sy_spin_lock(&info->lock);
        if (ret)
                YASSERT(0);

        if(gloconf.huge_page)
                __alloc_hugepages(info->__addr__, socket_id, (gloconf.memcache_count + HUGEPAGE_OFFSET));

        mem = __core_private_mem_init(info->__addr__);

        info->__addr__ += MEMPAGE_SIZE * (gloconf.memcache_count + HUGEPAGE_OFFSET);

        sy_spin_unlock(&info->lock);

        ret = __init_mempages(mem, gloconf.memcache_count);
        if (ret) {
                DERROR("init mempage error\n");
                goto err_ret;
        }

        buddy_init(mem->buddy_addr, gloconf.memcache_count);

  //      core_register_tls(VARIABLE_CORE, mem->start_addr);
 /*       core_register_tls(VARIABLE_HUGEPAGE, (void *)mem);*/
        mem->core_hash = core_hash;

        variable_set(VARIABLE_HUGEPAGE, (void *)mem);

        DINFO("cpu_id %d socket_id %d\n", cpu_id, socket_id);

        return (void *)mem->start_addr;
err_ret:
        return NULL;
}

void mempages_free(void *private_addr, void *addr)
{
        size_t offset;
  //      int ret;
        void **tmp = (void **)private_addr;
        struct mem_area_desc_t *mem;
        size_t  free_size;

        if (private_addr) {
                mem = tmp[VARIABLE_HUGEPAGE];

                YASSERT(((uint64_t)(addr -  mem->buddy_mem_addr)) % MEMPAGE_SIZE == 0);
                offset = ((uint64_t)(addr -  mem->buddy_mem_addr)) / MEMPAGE_SIZE;

                YASSERT(addr == mem->mempages[offset].virtaddr);

                free_size = buddy_free(mem->buddy_addr, offset);
                if (free_size == 0)
                        YASSERT(0);

                /*ret = __change_mem_priv(addr, free_size * MEMPAGE_SIZE, PROT_NONE);
                if (ret)
                        YASSERT(0); */

        } else {
                free(addr);
        }
}

/*
 * Allocate `size` bytes of page-granular memory: from the core's buddy
 * allocator when `private_addr` identifies a core region, otherwise via
 * posix_memalign on the libc heap.
 *
 * Aborts (UNIMPLEMENTED) when the buddy allocator is exhausted.
 */
void* mempages_alloc(void *private_addr, uint32_t size)
{
        void **addr = (void **)private_addr;
        struct mem_area_desc_t *mem;
        int index,ret;
        uint32_t mpage_num;
        struct mempage_t *mpage;

        if (private_addr) {

                mem = addr[VARIABLE_HUGEPAGE];

                /* round UP to whole mempages: the old `size >> 21` rounded
                 * down, under-allocating for sizes not a multiple of
                 * MEMPAGE_SIZE (and requesting 0 pages for size < 2MB) */
                mpage_num = (size + MEMPAGE_SIZE - 1) / MEMPAGE_SIZE;

                index = buddy_alloc(mem->buddy_addr, mpage_num);
                if (index < 0) {
                        ret = ENOMEM;
                        GOTO(err_ret, ret);
                }

                mpage = &mem->mempages[index];
                mem->non_io_allco_count++;
        } else {
                void *ptr;

                ret = posix_memalign(&ptr, MEMPAGE_SIZE, size);
                if (ret) {
                        YASSERT(0);
                }

                return ptr;
        }

        return mpage->virtaddr;

err_ret:
        UNIMPLEMENTED(__DUMP__);
        return NULL;
}

/*
 * Allocate from the calling core's private pool, rounding small requests
 * up to one hugepage (finer granularity is a TODO).
 */
void *huge_tls_malloc(size_t sz)
{
        const size_t min_sz = 2 * 1024 * 1024;
        core_t *core = core_self();

        assert(core);
        void *priv = core->tls[VARIABLE_CORE];
        assert(priv);

        if (sz < min_sz)  //todo in future.
                sz = min_sz;

        return mempages_alloc(priv, sz);
}

/*
 * Return memory obtained from huge_tls_malloc() to the calling core's
 * private pool.
 */
void huge_tls_free(void *ptr)
{
        core_t *core = core_self();

        assert(core);
        void *priv = core->tls[VARIABLE_CORE];
        assert(priv);

        mempages_free(priv, ptr);
}

/*
 * Sub-allocate `size` bytes (rounded up to PAGE_SIZE, at most one
 * mempage) for buffer-io from the calling core's pool.
 *
 * Pages are bump-allocated: the head of buffer_io_free_list serves
 * requests until full, then moves to buffer_io_alloc_list.  When the
 * free list is empty, up to `num` (4, halving on failure) fresh pages
 * are pulled from the buddy allocator.
 *
 * On success fills mem_handler (pool/idx/ptr/phyaddr) and returns 0;
 * returns ENOMEM when the request is too large.  mem_handler->pool must
 * be NULL on entry (asserted).
 */
int mem_hugepage_new(uint32_t size, mem_handler_t *mem_handler)
{
        struct mem_area_desc_t *mem = mem_handler->pool;
        struct mempage_t *mpage;
        int ret, index, i, num = 4;
        uint32_t tmp_size;

        if (likely(!mem)) {
                mem = mem_self();
        } else {
               YASSERT(0);
        }

        tmp_size = _align_up(size, PAGE_SIZE);
        if (unlikely(tmp_size > MEMPAGE_SIZE)) {
                ret = ENOMEM;
                GOTO(err_ret, ret);
        }

        ret = sy_spin_lock(&mem->lock);
        if (unlikely(ret))
                GOTO(err_ret, ret);

retry:
        /* refill the free list from the buddy allocator when empty */
        if (unlikely(list_empty(&mem->buffer_io_free_list))) {
                index = buddy_alloc(mem->buddy_addr, num);
                if (index < 0) {
                        /* halve the batch size and try again before giving up */
                        if (num > 1) {
                                num /= 2;
                                goto retry;
                        }

                        mem->buddy_alloc_count ++;

                        DWARN("mem: [%p], buddy_alloc_count: [%d], io_allocate_count: [%d], free_mempage_count: [%lu], non_io_allco_count: %d \n",
                                        mem,
                                        mem->buddy_alloc_count,
                                        mem->buffer_io_alloc_count,
                                        mem->free_mempage_count,
                                        mem->non_io_allco_count);

                        DWARN("next: %p, prev: %p\n", mem->buffer_io_free_list.next, mem->buffer_io_free_list.prev);
                        int i;
                        for (i = 0; i < ((struct buddy *)mem->buddy_addr)->mpage_count; i++) {
                                DINFO("[%d] ref: %d, offset: %d\n", mem->mempages[i].idx, mem->mempages[i].ref, mem->mempages[i].offset)
                        }
                        /* pool exhausted: treated as fatal (asserts before
                         * the ENOMEM path below can run) */
                        YASSERT(0);
                        ret = ENOMEM;
                        GOTO(err_ret, ret);
                }

                mpage = &mem->mempages[index];
               /* ret = __change_mem_priv(mpage->virtaddr, ((size_t)num) * MEMPAGE_SIZE, PROT_READ |PROT_WRITE);
                if (unlikely(ret))
                        YASSERT(0); */

               //DWARN("buddy alloc mem:%p %d num %d\n", mem, index, num);

                /* hand the freshly allocated pages to the free list */
                for (i = 0; i < num; i++) {
                        mpage = &mem->mempages[index + i];
                        list_add_tail(&mpage->list, &mem->buffer_io_free_list);
                        mem->free_mempage_count++;
                }

        }

        YASSERT(!list_empty(&mem->buffer_io_free_list));
        mpage = list_entry(mem->buffer_io_free_list.next, struct mempage_t, list);


        /* page (nearly) full: retire it to the alloc list; if the request
         * does not fit at all, retry with the next free page */
        if (unlikely(mpage->offset + tmp_size >= MEMPAGE_SIZE)) {
                if (list_del_safe(&mpage->list, &mem->buffer_io_free_list)) {
                        YASSERT(mem->free_mempage_count > 0);
                        mem->free_mempage_count--;
                } else {
                        /* mpage must in mem->buffer_io_free_list */
                        YASSERT(0);
                }
                list_add_tail(&mpage->list, &mem->buffer_io_alloc_list);
                mem->buffer_io_alloc_count++;

                if (mpage->offset + tmp_size > MEMPAGE_SIZE)
                        goto retry;
        }

        mpage->ref++;

        mem_handler->idx = mpage->idx;
        mem_handler->pool = mem;
        mem_handler->ptr = mpage->virtaddr + mpage->offset;
        mem_handler->phyaddr = mpage->phyaddr + mpage->offset;
        mpage->offset += tmp_size;
        sy_spin_unlock(&mem->lock);
        //_backtrace("hugepage");
        /*DWARN("mem: %p alloc %p pool %p idx %d offset  %d, ref %d size %u, buffer_io_alloc_count: %d, free_mempage_count: %lu\n",
                        mem, mpage->virtaddr, mem,mpage->idx, mpage->offset, mpage->ref, size, mem->buffer_io_alloc_count,
                        mem->free_mempage_count);*/

        return 0;

err_ret:
        return ret;
}

/*
 * Take an additional reference on the mempage backing `mem_handler`.
 * The page must already be referenced (asserted).
 */
int mem_hugepage_ref(mem_handler_t *mem_handler)
{
        int ret;
        struct mem_area_desc_t *mem = mem_handler->pool;
        struct mempage_t *page;

        YASSERT(mem);

        ret = sy_spin_lock(&mem->lock);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        page = &mem->mempages[mem_handler->idx];
        YASSERT(page->ref > 0);
        page->ref++;
        sy_spin_unlock(&mem->lock);

        return 0;
err_ret:
        return ret;
}

/*
 * Drop one reference on the mempage backing `mem_handler`, on the core
 * that owns the pool.  When the last reference is dropped, the page is
 * moved back from the alloc list to the free list and its bump cursor
 * is reset; the page is NOT returned to the buddy allocator (see the
 * disabled reclaim code below).
 */
static int local_core_free(mem_handler_t *mem_handler)
{
        struct mem_area_desc_t *mem = mem_handler->pool;
        struct mempage_t *mpage;
        int ret;

        ret = sy_spin_lock(&mem->lock);
        if (ret)
                GOTO(err_ret, ret);

        mpage = &mem->mempages[mem_handler->idx];
        YASSERT(mpage->ref > 0);
        mpage->ref--;

        if (mpage->ref == 0) {
                /* only pages that were retired to the alloc list need moving;
                 * a page still on the free list stays where it is */
                if (list_del_safe(&mpage->list, &mem->buffer_io_alloc_list)) {
                        YASSERT(mem->buffer_io_alloc_count > 0);
                        mem->buffer_io_alloc_count--;
                        list_add_tail(&mpage->list, &mem->buffer_io_free_list);
                        mem->free_mempage_count++;
                }
                mpage->offset = 0;
#if 0
                /* disabled: trim the free list back to 32 pages by handing
                 * surplus pages back to the buddy allocator */
                size_t free_size, offset;
                struct list_head *pos, *n;
                list_for_each_safe(pos, n, &mem->buffer_io_free_list) {
                        if (mem->free_mempage_count <= 32)
                                break;

                        mpage = (struct mempage_t *)pos;
                        offset = ((uint64_t)(mpage->virtaddr - mem->buddy_mem_addr)) / MEMPAGE_SIZE;

                        YASSERT(mpage->virtaddr == mem->mempages[offset].virtaddr);

                        list_del(&mpage->list);
                        mem->free_mempage_count--;

                        free_size = buddy_free(mem->buddy_addr, offset);
                        if (unlikely(free_size == 0))
                                YASSERT(0);
                }
#endif
        }

        sy_spin_unlock(&mem->lock);

        return 0;
err_ret:
        return ret;
}

/*
 * core_request() callback executed on the owning core: unpack the
 * handler and free it locally there.
 */
static int __remote_core_free(va_list ap)
{
        mem_handler_t *handler = va_arg(ap, mem_handler_t *);
        int ret;

        va_end(ap);

        ret = local_core_free(handler);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        return 0;
err_ret:
        return ret;
}

/*
 * Forward a free to the core that owns the pool, via core_request().
 */
static int remote_core_free(mem_handler_t *mem_handler)
{
        struct mem_area_desc_t *mem = mem_handler->pool;
        int ret;

        ret = core_request(mem->core_hash, -1, "mem_hugepage_deref",
                           __remote_core_free, mem_handler);
        if (unlikely(ret))
                GOTO(err_ret, ret);

        return 0;
err_ret:
        return ret;
}

/*
 * Drop one reference on the mempage backing `mem_handler`, freeing
 * locally when the caller is not a polling core or owns the pool, and
 * forwarding to the owning core otherwise.
 */
int mem_hugepage_deref(mem_handler_t *mem_handler)
{
        int ret, owner_hash;
        struct mem_area_desc_t *mem = mem_handler->pool;
        core_t *core = core_self();

        YASSERT(mem);

        /* snapshot the owning core's hash under the pool lock */
        ret = sy_spin_lock(&mem->lock);
        if (ret)
                GOTO(err_ret, ret);
        owner_hash = mem->core_hash;
        sy_spin_unlock(&mem->lock);

        if (core == NULL || likely(core->hash == owner_hash))
                ret = local_core_free(mem_handler);
        else
                ret = remote_core_free(mem_handler);

        if (ret)
                GOTO(err_ret, ret);

        return 0;
err_ret:
        return ret;
}

/*
 * Convenience wrapper around mem_hugepage_new(): allocate `size` bytes
 * of buffer-io memory, optionally reporting the physical address.
 * Returns NULL on failure.
 */
void *mem_hugepage_alloc(size_t size, unsigned long *paddr)
{
        mem_handler_t handler;

        handler.ptr  = NULL;
        handler.idx  = -1;
        handler.pool = NULL;

        if (unlikely(mem_hugepage_new(size, &handler) != 0))
                return NULL;

        if (paddr != NULL)
                *paddr = handler.phyaddr;

        return handler.ptr;
}

/*
 * Map an address back to its mempage descriptor in the calling context's
 * pool.  Returns NULL when the address lies outside the buddy-managed
 * region.
 */
struct mempage_t *mem_search_hp(void *vaddr)
{
        struct mem_area_desc_t *mem;
        int64_t diff;
        size_t index;

        mem = mem_self();

        /* the old code cast (vaddr - buddy_mem_addr) straight to uint64_t:
         * an address below the pool wrapped to a huge offset, and the
         * later `index < 0` test on the truncated int could not reliably
         * reject it — check the sign before dividing instead */
        diff = (int64_t)((char *)vaddr - (char *)mem->buddy_mem_addr);
        if (diff < 0)
                return NULL;

        index = (size_t)diff / MEMPAGE_SIZE;
        if (index >= (size_t)gloconf.memcache_count)
                return NULL;

        return mem->mempages + index;
}

/*
 * Translate a virtual address to physical: use the cached mempage
 * physical address when the address belongs to the pool, otherwise fall
 * back to a /proc/self/pagemap lookup.
 */
unsigned long mem_vtophys(void *vaddr)
{
        struct mempage_t *page = mem_search_hp(vaddr);

        if (unlikely(page == NULL))
                return __virt2phy(vaddr);

        return page->phyaddr + (uint64_t)(vaddr - page->virtaddr);
}

/*
 * Free by raw address: look up the backing mempage and drop one
 * reference via mem_hugepage_deref().  NULL and out-of-pool addresses
 * are rejected (the latter with an error log).
 */
void mem_hugepage_free(void *addr)
{
        struct mempage_t *page;
        mem_handler_t handler;

        if (addr == NULL)
                return;

        page = mem_search_hp(addr);
        if (unlikely(page == NULL)) {
                DERROR("Invalid address %p for free\n", addr);
                return;
        }

        handler.pool = mem_self();
        handler.idx  = page->idx;

        mem_hugepage_deref(&handler);
}

/*
 * Reserve kernel hugepages by writing the desired count (requested pages
 * plus the configured reserve plus 32 spare) to
 * /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages.
 *
 * Returns 0 on success, an errno-style code on failure.
 */
static int __init_proc_vm(int total_hugepage_num)
{
        int ret, fd;
        char tmp[16];
        int count = 0;

        fd = open(HUGEPAGE_PROC_FILE, O_RDWR);
        if (fd < 0) {
                ret = errno;
                DERROR("open hugepage proc file [%s]failed :%s\n", HUGEPAGE_PROC_FILE, strerror(ret));
                GOTO(err_ret, ret);
        }

        /* leave room for a terminating NUL: the old code read the full 16
         * bytes, leaving the buffer potentially unterminated */
        ret = read(fd, tmp, sizeof(tmp) - 1);
        if (ret < 0) {
                ret = errno;
                DERROR("read hugepage proc file [%s]failed :%s\n", HUGEPAGE_PROC_FILE, strerror(ret));
                GOTO(err_fd, ret);
        }
        tmp[ret] = '\0';

        //count = atoi(tmp);

        /* the kernel's current value is deliberately ignored in favour of
         * the configured reserve */
        count = gloconf.huge_page_reserved;

        DINFO("alloc hugepages %d, previous is: %d\n", total_hugepage_num, count);
        /* use the whole buffer; the old limit of 8 could truncate the count */
        snprintf(tmp, sizeof(tmp), "%d", total_hugepage_num + count + 32);
        ret = write(fd, tmp, strlen(tmp));
        if (ret < 0) {
                ret = errno;
                DERROR("write hugepage proc file [%s]failed :%s\n", HUGEPAGE_PROC_FILE, strerror(ret));
                GOTO(err_fd, ret);
        }

        DINFO("write file %s, sz: %d, buf: %s\r\n", HUGEPAGE_PROC_FILE, ret, tmp);

        close(fd);

        return 0;

err_fd:
        close(fd);
err_ret:
        return ret;

}

/* Create and touch one hugetlbfs file per hugepage, record each page's
 * NUMA node in mem_node_info[], and verify the pages ended up evenly
 * split across nodes.  Returns 0 on success, EAGAIN when the split is
 * unbalanced (caller retries), or an errno-style code on hard failure.
 * The addr parameter is only scratch; it is overwritten by mmap(). */
static int __create_hugepage_file__(void *addr, int total_hugepage_num)
{
        int socketid, fd;
        int ret, i;
        unsigned long nodemask;
        int node_count = get_cpunode_count();
        int pages_count[node_count];
        char  path[128];


        memset(pages_count, 0x00, sizeof(int) * node_count);
        for (i = 0; i < total_hugepage_num; i++) {
                snprintf(path, sizeof(path), "%s/lich_mempages_%d", HUGEPAGE_DIR, i);
                fd = open(path, O_CREAT|O_RDWR, 0755);
                if (fd < 0) {
                        ret = errno;
                        DERROR("error %s on open %s\n", strerror(ret), path);
                        GOTO(err_ret, ret);
                }

                addr = mmap(NULL, MEMPAGE_SIZE, PROT_WRITE |PROT_READ, MAP_SHARED, fd, 0);
                if (addr == MAP_FAILED) {
                        ret = errno;
                        DERROR("error on mmap %s, pos: %d\n", strerror(ret), i);
                        GOTO(err_fd, ret);
                }

                /* Touch the mapping so the kernel actually allocates the page. */
                memset(addr, 0x00, MEMPAGE_SIZE);

                /* Spread pages across NUMA nodes round-robin.  The original
                 * used (1 << i): undefined behavior once i >= 32 (shift past
                 * int width) and a mask of nonexistent nodes for
                 * i >= node_count, so mbind could never honor it. */
                nodemask = 1UL << (i % node_count);
                ret = mbind(addr, MEMPAGE_SIZE, MPOL_PREFERRED, &nodemask,
                            node_count + 1, 0);
                if(ret)
                        DWARN("mbind failed with error, ret=%d, errno=%d\n", ret, errno);

                ret = __find_numasocket(addr, &socketid);
                if (ret) {
                        DERROR("error on find numasocket \n");
                        GOTO(err_map, ret);
                }

                /* NOTE(review): this rebalance only handles two nodes; a page
                 * may be accounted to the other node.  Some memory may cross
                 * NUMA boundaries, costing a little performance, but works. */
                if(pages_count[socketid] == total_hugepage_num / node_count)
                        socketid = socketid == 0? 1: 0;

                pages_count[socketid]++;
                mem_node_info[i] = socketid;
                munmap(addr, MEMPAGE_SIZE);   //cause aio submit to nvme error in little chances.

                close(fd);
        }

        /* Require a perfectly even split; otherwise ask the caller to retry. */
        for (i = 0; i < node_count; i++) {
                if (pages_count[i] != (total_hugepage_num / node_count)) {
                        DWARN("i %d page count %d, avg %d\n", i, pages_count[i], (total_hugepage_num / node_count));
                        ret = EAGAIN;
                        GOTO(err_ret, ret);
                }
                else
                        DWARN("i %d page count %d, avg %d\n", i, pages_count[i], (total_hugepage_num / node_count));
        }

        return 0;

err_map:
        /* The original leaked this mapping on the numasocket error path. */
        munmap(addr, MEMPAGE_SIZE);
err_fd:
        close(fd);
err_ret:
        return ret;
}

/* Reserve hugepages with the kernel and back each one with a hugetlbfs
 * file, recording per-page NUMA placement in mem_node_info.
 * Returns 0 on success, EAGAIN if placement was unbalanced (caller
 * retries), or an errno-style code. */
static int __create_hugepage_file()
{
        int ret;
        void *ptr = NULL;
        int total_hugepage_num = gloconf.memcache_count +
                                (gloconf.memcache_count + 16) * cpuset_useable();

        ret = __init_proc_vm(total_hugepage_num);
        if (ret)
                GOTO(err_ret, ret);

        /* Scratch page-aligned buffer; __create_hugepage_file__ overwrites
         * its addr argument immediately, so this is never used for data. */
        ret = posix_memalign((void **)&ptr, MEMPAGE_SIZE, MEMPAGE_SIZE);
        if (ret)
                GOTO(err_ret, ret);

        YASSERT(mem_node_info == NULL);
        mem_node_info = malloc(sizeof(int) * total_hugepage_num);
        if (mem_node_info == NULL) {
                /* The original left ret at 0 here, returning success on a
                 * failed allocation. */
                ret = ENOMEM;
                GOTO(err_free, ret);
        }

        ret = __create_hugepage_file__(ptr, total_hugepage_num);
        if (ret)
                GOTO(err_free1, ret);

#ifdef SPDK
        /* SPDK needs an extra 512-page cushion. */
        ret = __init_proc_vm(total_hugepage_num + 512);
        if (ret)
                GOTO(err_free1, ret);   /* was err_ret: leaked ptr + mem_node_info */
#endif
        free(ptr);   /* scratch buffer; the original leaked it on success */
        return 0;

err_free1:
        free(mem_node_info);
        mem_node_info = NULL;
err_free:
        free(ptr);
err_ret:
        return ret;
}

/* Ensure /dev/hugepages exists and carries a fresh hugetlbfs mount:
 * create the directory, unmount any existing hugetlbfs there, then
 * remount.  Returns 0 on success (including the best-effort case where
 * the old mount could not be removed), EIO if the remount fails. */
static int __huge_dir_init()
{
        char line[256], *hugetlb_str;
        int ret;
        FILE *fp;

        ret = mkdir("/dev/hugepages", 0700);
        if (ret < 0 && errno != EEXIST) {
                DWARN("create /dev/hugepages/ dir failed %s\n", strerror(errno));
                YASSERT(0);
        }

        fp = fopen("/proc/self/mounts", "r");
        if (fp == NULL) {
                DWARN("open /proc/self/mounts failed\n");
                YASSERT(0);
        }

        DINFO("cleanup /dev/hugepages mount \n");
        while(fgets(line, 256, fp) != NULL) {
                hugetlb_str = strstr(line, "/dev/hugepages");
                if (hugetlb_str != NULL) {
                        ret = umount("/dev/hugepages");
                        if (ret != 0) {
                                DWARN("the dir /dev/hugepages umount failed\n");
                                //YASSERT(0);
                                //ignore this error: keep using the old mount.
                                fclose(fp);   /* was leaked on this early return */
                                return 0;
                        }

                }
        }

        fclose(fp);   /* the original never closed the mounts stream */

        DINFO("hugetlbfs filesystem  will remount \n");
        ret = mount("hugetlbfs", "/dev/hugepages", "hugetlbfs", MS_MGC_VAL, NULL);
        if (ret < 0) {
                DWARN("mount error :%s\n", strerror(errno));
                ret = EIO;
                GOTO(err_ret, ret);
        }

        return 0;
err_ret:
        return ret;
}

static int __global_mem_init(struct mem_area_desc_t **_mem)
{
        int ret;
        struct mem_area_desc_t *mem;
        void *addr;

        mem = malloc(sizeof(struct mem_area_desc_t) + sizeof(struct mempage_t) * gloconf.memcache_count);
        if (mem == NULL) {
                ret = errno;
                GOTO(err_ret, ret);
        }

        __mem_info__ = mem;

        mem->buddy_addr = malloc(2 * sizeof(size_t) * gloconf.memcache_count);
        if (mem->buddy_addr == NULL) {
                ret = errno;
                GOTO(err_ret, ret);
        }

        addr = malloc((uint64_t)(gloconf.memcache_count + 1) * MEMPAGE_SIZE);
        if (addr == NULL) {
                ret = errno;
                GOTO(err_ret, ret);
        }

        if ((uint64_t)addr % MEMPAGE_SIZE) {
                addr +=  MEMPAGE_SIZE - (uint64_t)addr % MEMPAGE_SIZE;
        }

        mem->buddy_mem_addr = addr;
        INIT_LIST_HEAD(&mem->buffer_io_alloc_list);
        INIT_LIST_HEAD(&mem->buffer_io_free_list);
        sy_spin_init(&mem->lock);
        mem->buffer_io_alloc_count = 0;
        mem->free_mempage_count = 0;

        *_mem = mem;

        return 0;
err_ret:
        return ret;
}

/* Initialize the global memory pool.  In hugepage mode the daemon first
 * (re)mounts hugetlbfs and reserves hugepage files, retrying on EAGAIN
 * until the NUMA placement balances; then the area descriptor is built,
 * the backing pages are pinned (hugepages or mlock), bookkeeping pages
 * are initialized, and the buddy allocator is seeded.
 * Returns 0 on success, errno-style code on failure. */
int global_mem_init(int daemon)
{
        int ret = 0;
        struct mem_area_desc_t *desc = NULL;

        if (gloconf.huge_page) {
                YASSERT(mem_node_info == NULL);

                /* Only the daemon owns the hugetlbfs mount and files. */
                while (daemon) {
                        ret = __huge_dir_init();
                        if (ret)
                                GOTO(err_ret, ret);

                        ret = __create_hugepage_file();
                        if (ret == 0)
                                break;
                        if (ret != EAGAIN)
                                GOTO(err_ret, ret);

                        /* unbalanced NUMA placement: retry after a pause */
                        sleep(1);
                }
        }

        ret = __global_mem_init(&desc);
        if (ret)
                GOTO(err_ret, ret);

        if (daemon) {
                if (gloconf.huge_page) {
                        /* Back the data region with the reserved hugepages. */
                        ret = __alloc_hugepages(desc->buddy_mem_addr, -1,
                                                gloconf.memcache_count);
                        if (ret)
                                YASSERT(0);
                } else {
                        /* No hugepages: pin ordinary pages in RAM instead. */
                        ret = mlock(desc->buddy_mem_addr,
                                    (uint64_t)MEMPAGE_SIZE * gloconf.memcache_count);
                        if (ret) {
                                DERROR("mlock mempages failed %s\n", strerror(ret));
                                GOTO(err_ret, ret);
                        }
                }
        }

        ret = __init_mempages(desc, gloconf.memcache_count);
        if (ret)
                GOTO(err_ret, ret);

        buddy_init(desc->buddy_addr, (size_t)gloconf.memcache_count);

        return 0;
err_ret:
        return ret;
}

/* Byte size of one polling core's private memory area: the configured
 * cache pages plus HUGEPAGE_OFFSET extra pages of overhead. */
size_t get_core_mempages_size()
{
        size_t pages = (size_t)gloconf.memcache_count + HUGEPAGE_OFFSET;

        return pages * MEMPAGE_SIZE;
}